# Docker Compose stack that runs the text-generation-inference image with an
# NVIDIA GPU reservation.
#
# Settings are read from the shell environment or from an `.env` file placed
# next to this file:
#   MODEL_ID                (required) value passed to --model-id
#   NUM_SHARD               (optional, defaults to 1) value passed to --num-shard
#   HUGGING_FACE_HUB_TOKEN  (optional) forwarded to the container when set
version: "3.8"

services:
  gpt4all_gpu:
    image: ghcr.io/huggingface/text-generation-inference:0.9.3
    container_name: gpt4all_gpu
    # Restart the container whenever it exits, whatever the exit status.
    restart: always
    environment:
      # Key without a value: Compose forwards the host's value and leaves the
      # variable unset in the container when the host has none. This keeps the
      # secret out of the file.
      - HUGGING_FACE_HUB_TOKEN
      - USE_FLASH_ATTENTION=false
      # `${VAR:?msg}` makes Compose abort with `msg` when VAR is unset or
      # empty, instead of starting a container that cannot load a model.
      - "MODEL_ID=${MODEL_ID:?set MODEL_ID in the shell or in the .env file}"
      - "NUM_SHARD=${NUM_SHARD:-1}"
    # `${...}` here is substituted by Compose from the shell / .env file, NOT
    # from the `environment:` list above, so both places use the same
    # interpolation to stay in agreement. List form keeps each argument intact.
    command:
      - "--model-id"
      - "${MODEL_ID}"
      - "--num-shard"
      - "${NUM_SHARD:-1}"
    volumes:
      # The directory holding this file is mounted at /data in the container.
      - ./:/data
    ports:
      # host port 8080 -> container port 80
      - "8080:80"
    shm_size: 1g
    deploy:
      resources:
        reservations:
          devices:
            # No `count` / `device_ids` given; per the Compose GPU docs this
            # reserves every GPU available on the host.
            - driver: nvidia
              capabilities: [gpu]